2  bulkRNA: KEGG Gene Set Expression

2.1 Set up workspace

# Libraries
library(msigdbr)
library(viridis)
Loading required package: viridisLite
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.2.1
✔ purrr     1.0.4     ✔ tidyr     1.3.1
✔ readr     2.1.5     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ComplexHeatmap)
Loading required package: grid
========================================
ComplexHeatmap version 2.18.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite either one:
- Gu, Z. Complex Heatmap Visualization. iMeta 2022.
- Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
    genomic data. Bioinformatics 2016.


The new InteractiveComplexHeatmap package can directly export static 
complex heatmaps into an interactive Shiny app with zero effort. Have a try!

This message can be suppressed by:
  suppressPackageStartupMessages(library(ComplexHeatmap))
========================================
library(circlize)
========================================
circlize version 0.4.16
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: https://jokergoo.github.io/circlize_book/book/

If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
  in R. Bioinformatics 2014.

This message can be suppressed by:
  suppressPackageStartupMessages(library(circlize))
========================================

2.2 Load pt and projecTILs colors

pt_cols <- readRDS("Part0_Patient_Fill.rds")
pt_fill <- readRDS("Part0_Patient_Fill.rds")

2.3 Load RSEM results

p101_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-101.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P101" = "TPM")
p103_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-103.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P103" = "TPM")
p104_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-104.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P104" = "TPM")
p105_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-105.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P105" = "TPM")
p106_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-106.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P106" = "TPM")
p108_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-108.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P108" = "TPM")
p109_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-109_Tumor_WES_01-18279-109_Normal_WES_01.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P109" = "TPM")
p110_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-110_Tumor_WES_01-18279-110-Normal_WES_01.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P110" = "TPM")
p111_rsem <- read.table("/jsimonlab/users/chloetu/melanoma/Melanoma_Eryn/analysis_093024/objects/bulkRNA/rsem_gene_results/18279-111_Tumor_WES_01-18279-111-Normal_WES_01.rsem.genes.results.gz", header = TRUE) %>%
  mutate(gene_id_stable = str_split_i(gene_id, fixed("."), 1)) %>%
  dplyr::rename("P111" = "TPM")

2.4 Load AntigenProcessingPresentation, MHC class I, MHC class II pathways

antigen_processing_geneset <- msigdbr(species = "Homo sapiens") %>%
  filter(gs_name == "KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION") %>%
  select(human_gene_symbol, ensembl_gene)

# HLA class I (A-C)
hla_class1 <- msigdbr(species = "Homo sapiens") %>%
  distinct(gene_symbol, ensembl_gene) %>%
  filter(str_detect(gene_symbol, "^HLA-(A|B|C)"))

# HLA class II (ie any HLA that is HLA-D*)
hla_class2 <- msigdbr(species = "Homo sapiens") %>%
  distinct(gene_symbol, ensembl_gene) %>%
  filter(str_detect(gene_symbol, "^HLA-(?=D)"))

# Remove genes that are in HLA class II or are HLA A-C from the KEGG antigen processing gene list since it's a duplication
antigen_processing_geneset <- antigen_processing_geneset %>%
  filter(!(human_gene_symbol %in% hla_class1$gene_symbol),
         !(human_gene_symbol %in% hla_class2$gene_symbol))

2.5 Extract KEGG antigen processing genes

p101_antigen_processing_rsem <- p101_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p103_antigen_processing_rsem <- p103_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p104_antigen_processing_rsem <- p104_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p105_antigen_processing_rsem <- p105_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p106_antigen_processing_rsem <- p106_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p108_antigen_processing_rsem <- p108_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p109_antigen_processing_rsem <- p109_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p110_antigen_processing_rsem <- p110_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)
p111_antigen_processing_rsem <- p111_rsem %>%
  filter(gene_id_stable %in% antigen_processing_geneset$ensembl_gene)

all_antigen_processing_rsem <- p101_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P101")] %>%
  full_join(p103_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P103")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p104_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P104")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p105_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P105")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p106_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P106")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p108_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P108")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p109_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P109")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p110_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P110")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p111_antigen_processing_rsem[,c("gene_id", "gene_id_stable", "P111")], by = c("gene_id", "gene_id_stable"))  %>%
  mutate(Type = "AntigenProcessingPresentation")

## For the 4 pt: Remove lowly expressed genes, ie. gene was expressed (>=1 TPM) in at least one patient
best_antigen_processing_rsem_4pt <- all_antigen_processing_rsem %>%
  rowwise() %>%
  filter((P101 >= 1) | (P103 >= 1) | (P104 >= 1) | (P108 >= 1))

## For all pt: Remove lowly expressed genes, ie. gene was expressed (>=1 TPM) in at least one patient
best_antigen_processing_rsem_9pt <- all_antigen_processing_rsem %>%
  rowwise() %>%
  filter((P101 >= 1) | (P103 >= 1) | (P104 >= 1) |  (P105 >= 1) | (P106 >= 1) | (P108 >= 1) | (P109 >= 1) | (P110 >= 1) | (P111 >= 1))

2.6 Extract top HLA class I (ie HLA-A, HLA-B, HLA-C)

p101_class1_rsem <- p101_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p103_class1_rsem <- p103_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p104_class1_rsem <- p104_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p105_class1_rsem <- p105_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p106_class1_rsem <- p106_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p108_class1_rsem <- p108_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p109_class1_rsem <- p109_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p110_class1_rsem <- p110_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)
p111_class1_rsem <- p111_rsem %>%
  filter(gene_id_stable %in% hla_class1$ensembl_gene)

all_class1_rsem <- p101_class1_rsem[,c("gene_id", "gene_id_stable", "P101")] %>%
  full_join(p103_class1_rsem[,c("gene_id", "gene_id_stable", "P103")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p104_class1_rsem[,c("gene_id", "gene_id_stable", "P104")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p105_class1_rsem[,c("gene_id", "gene_id_stable", "P105")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p106_class1_rsem[,c("gene_id", "gene_id_stable", "P106")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p108_class1_rsem[,c("gene_id", "gene_id_stable", "P108")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p109_class1_rsem[,c("gene_id", "gene_id_stable", "P109")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p110_class1_rsem[,c("gene_id", "gene_id_stable", "P110")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p111_class1_rsem[,c("gene_id", "gene_id_stable", "P111")], by = c("gene_id", "gene_id_stable")) %>%
  mutate(Type = "ClassI")

2.7 Extract HLA class II (ie any HLA-D*)

p101_class2_rsem <- p101_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p103_class2_rsem <- p103_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p104_class2_rsem <- p104_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p105_class2_rsem <- p105_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p106_class2_rsem <- p106_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p108_class2_rsem <- p108_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p109_class2_rsem <- p109_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p110_class2_rsem <- p110_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)
p111_class2_rsem <- p111_rsem %>%
  filter(gene_id_stable %in% hla_class2$ensembl_gene)

all_class2_rsem <- p101_class2_rsem[,c("gene_id", "gene_id_stable", "P101")] %>%
  full_join(p103_class2_rsem[,c("gene_id", "gene_id_stable", "P103")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p104_class2_rsem[,c("gene_id", "gene_id_stable", "P104")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p105_class2_rsem[,c("gene_id", "gene_id_stable", "P105")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p106_class2_rsem[,c("gene_id", "gene_id_stable", "P106")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p108_class2_rsem[,c("gene_id", "gene_id_stable", "P108")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p109_class2_rsem[,c("gene_id", "gene_id_stable", "P109")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p110_class2_rsem[,c("gene_id", "gene_id_stable", "P110")], by = c("gene_id", "gene_id_stable")) %>%
  full_join(p111_class2_rsem[,c("gene_id", "gene_id_stable", "P111")], by = c("gene_id", "gene_id_stable"))  %>%
  mutate(Type = "ClassII")

## For the 4 pt: Remove lowly expressed genes, ie. gene was expressed (>=1 TPM) in at least one patient
best_class2_rsem_4pt <- all_class2_rsem %>%
  rowwise() %>%
  filter((P101 >= 1) | (P103 >= 1) | (P104 >= 1) | (P108 >= 1))

2.8 Plot classI, II, and antigen presentation from main 4 patients in a boxplot for Fig S7D

best_4pt.boxplot <- best_antigen_processing_rsem_4pt %>%
  rbind(all_class1_rsem) %>%
  rbind(best_class2_rsem_4pt) %>%
  ungroup() %>%
  mutate(Type = str_replace(Type, "AntigenProcessingPresentation", "AntigenProcessing\nPresentation"))

boxplot <- best_4pt.boxplot %>%
  select("P101", "P103", "P104", "P108", Type) %>%
  pivot_longer(cols = c("P101", "P103", "P104", "P108"), names_to = "Patient") %>%
  ggplot(aes(x = Patient, y = value, fill = Patient)) +
    geom_boxplot() +
    geom_point() +
    scale_y_log10() +
  facet_wrap(~Type, scales = "free_y") +
  theme_bw() +
  pt_fill +
  annotation_logticks(sides='l', outside = TRUE) +
  theme(axis.text.y = element_text(margin = margin(r = 7))) +
  ylab("TPM") +
  coord_cartesian(clip = "off")

boxplot
Warning in scale_y_log10(): log-10 transformation introduced infinite values.
log-10 transformation introduced infinite values.
Warning: Removed 13 rows containing non-finite outside the scale range
(`stat_boxplot()`).

# Get gene symbols
colnames(antigen_processing_geneset) <- c("gene_symbol", "ensembl_gene")
genes <- do.call(rbind, list(antigen_processing_geneset, hla_class1, hla_class2))

boxplot_genelist_4pt <- best_4pt.boxplot %>%
  select(gene_id, gene_id_stable, P101, P103, P104, P108, Type) %>%
  left_join(genes, by = c("gene_id_stable" = "ensembl_gene"))

2.9 Create a heatmap of the antigen processing, class I and class II presentation

Adding class I and class II genes first so any genes in both classI/II and AntigenProcessingPresentation are assigned to classI/II automatically

2.10 Select 4 patients and select the subset of genes that are expressed in our 4 patients

kegg_hla_4pt.mat <- boxplot_genelist_4pt %>%
  select(gene_symbol, P101, P103, P104, P108) %>%
  column_to_rownames("gene_symbol") %>%
  as.matrix()

2.11 Create column annotation

md_4pt <- data.frame(TPM_name = c("P101", "P103", "P104", "P108"),
                    Patient = c("P101", "P103", "P104", "P108")) %>%
  column_to_rownames("TPM_name")

2.12 Create heatmap of KEGG antigen presentation + processing, HLA class I, and HLA class II for 4 pt for S7E

# Double check that cells in metadata and matrix are in the same order
table(rownames(md_4pt) == colnames(kegg_hla_4pt.mat))

TRUE 
   4 
column_annot_4pt.ha = HeatmapAnnotation(Patient = md_4pt$Patient,
                                    col = list(Patient = c("P101" = "#59A14F", "P103" = "#B07AA1FF", "P104" = "#76B7B2FF", "P108" = "#F6AAC9FF")))

# Subset/Reorder so that cells in the gene list and matrix are in the same order
boxplot_genelist_4pt <- boxplot_genelist_4pt[match(rownames(kegg_hla_4pt.mat), boxplot_genelist_4pt$gene_symbol),] %>%
  mutate(Type = str_replace(Type, "AntigenProcessing\nPresentation", "AntigenProcessingPresentation"),
         Type = factor(Type, levels = c("ClassI", "ClassII", "AntigenProcessingPresentation")))

kegg_hla_4pt.log10 <- log10(kegg_hla_4pt.mat+1)

heatmap_viridis <- Heatmap(kegg_hla_4pt.log10,
        show_row_dend = FALSE,
        show_row_names = TRUE,
        row_names_gp = gpar(fontsize = 6),
        row_title_rot = 90,
        show_column_dend = FALSE,
        show_column_names = FALSE,
        top_annotation = column_annot_4pt.ha,
        row_split = boxplot_genelist_4pt$Type,
        cluster_row_slices = FALSE,
        name = "log10(TPM+1)",
        col = colorRamp2(c(0, 1, 1.4, 2.1, 2.4, 2.7, 2.9, 3.3), c("#440154FF", "#482878FF", "#3E4A89FF", "#35B779FF", "#6DCD59FF", "#B4DE2CFF", "#E3E418FF", "#FDE725FF")))

heatmap_viridis

2.13 Get session info

sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Rocky Linux 8.10 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] circlize_0.4.16       ComplexHeatmap_2.18.0 lubridate_1.9.4      
 [4] forcats_1.0.0         stringr_1.5.1         purrr_1.0.4          
 [7] readr_2.1.5           tidyr_1.3.1           tibble_3.2.1         
[10] tidyverse_2.0.0       ggplot2_3.5.1         dplyr_1.1.4          
[13] viridis_0.6.5         viridisLite_0.4.2     msigdbr_7.5.1        

loaded via a namespace (and not attached):
 [1] gtable_0.3.6        babelgene_22.9      shape_1.4.6.1      
 [4] rjson_0.2.23        xfun_0.50           htmlwidgets_1.6.4  
 [7] GlobalOptions_0.1.2 tzdb_0.5.0          Cairo_1.6-2        
[10] vctrs_0.6.5         tools_4.3.2         generics_0.1.3     
[13] stats4_4.3.2        parallel_4.3.2      cluster_2.1.8.1    
[16] pkgconfig_2.0.3     RColorBrewer_1.1-3  S4Vectors_0.40.2   
[19] lifecycle_1.0.4     compiler_4.3.2      farver_2.1.2       
[22] munsell_0.5.1       codetools_0.2-20    clue_0.3-66        
[25] htmltools_0.5.8.1   pillar_1.10.1       crayon_1.5.3       
[28] magick_2.8.6        iterators_1.0.14    foreach_1.5.2      
[31] tidyselect_1.2.1    digest_0.6.37       stringi_1.8.4      
[34] fastmap_1.2.0       colorspace_2.1-1    cli_3.6.3          
[37] magrittr_2.0.3      withr_3.0.2         scales_1.3.0       
[40] timechange_0.3.0    rmarkdown_2.29      matrixStats_1.5.0  
[43] gridExtra_2.3       png_0.1-8           GetoptLong_1.0.5   
[46] hms_1.1.3           evaluate_1.0.1      knitr_1.49         
[49] IRanges_2.36.0      doParallel_1.0.17   rlang_1.1.5        
[52] Rcpp_1.0.14         glue_1.8.0          BiocGenerics_0.48.1
[55] rstudioapi_0.17.1   jsonlite_1.8.9      R6_2.6.1